Urllib库

内置的HTTP请求库

  • 请求模块: urllib.request
  • 异常处理模块: urllib.error
  • url解析模块:urllib.parse
  • robots.txt解析模块:urllib.robotparser
  • 代理: ProxyHandler (翻墙)
  • Cookie: http.cookiejar
  • 网址拼接: urljoin
  • 将字典转换成请求参数: urlencode

urllib

urlopen

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

1
2
3
4
import urllib.request  # request module

# Fetch the page with a plain GET request and dump the decoded HTML.
resp = urllib.request.urlopen('http://www.baidu.com')
html = resp.read().decode('utf-8')  # body arrives as bytes; decode with utf-8
print(html)
1
<!DOCTYPE html><!--STATUS OK-->


百度一下,你就知道



1
    </html>
1
2
3
4
5
6
import urllib.parse  # URL parsing helpers
import urllib.request

# Encode the form field word=hello to bytes; supplying a `data`
# argument turns the request into a POST.
payload = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=payload)
print(response.read())
1
b'{\n  "args": {}, \n  "data": "", \n  "files": {}, \n  "form": {\n    "word": "hello"\n  }, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Content-Length": "10", \n    "Content-Type": "application/x-www-form-urlencoded", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.6", \n    "X-Amzn-Trace-Id": "Root=1-5f0d3ca2-bb53229b018e17799e02b1ae"\n  }, \n  "json": null, \n  "origin": "183.207.182.162", \n  "url": "http://httpbin.org/post"\n}\n'
1
2
3
4
import urllib.request

# `timeout` caps how many seconds we wait for the server to answer.
response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
---------------------------------------------------------------------------

timeout                                   Traceback (most recent call last)

<ipython-input-9-624debaefd14> in <module>
      1 import urllib.request
      2 
----> 3 response = urllib.request.urlopen('http://httpbin.org/get', timeout=1) #timeout超时响应时间
      4 print(response.read())


D:\Anaconda3\envs\CPU\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221     else:
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 
    225 def install_opener(opener):


D:\Anaconda3\envs\CPU\lib\urllib\request.py in open(self, fullurl, data, timeout)
    524             req = meth(req)
    525 
--> 526         response = self._open(req, data)
    527 
    528         # post-process response


D:\Anaconda3\envs\CPU\lib\urllib\request.py in _open(self, req, data)
    542         protocol = req.type
    543         result = self._call_chain(self.handle_open, protocol, protocol +
--> 544                                   '_open', req)
    545         if result:
    546             return result


D:\Anaconda3\envs\CPU\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    502         for handler in handlers:
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:
    506                 return result


D:\Anaconda3\envs\CPU\lib\urllib\request.py in http_open(self, req)
   1344 
   1345     def http_open(self, req):
-> 1346         return self.do_open(http.client.HTTPConnection, req)
   1347 
   1348     http_request = AbstractHTTPHandler.do_request_


D:\Anaconda3\envs\CPU\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
   1319             except OSError as err: # timeout error
   1320                 raise URLError(err)
-> 1321             r = h.getresponse()
   1322         except:
   1323             h.close()


D:\Anaconda3\envs\CPU\lib\http\client.py in getresponse(self)
   1352         try:
   1353             try:
-> 1354                 response.begin()
   1355             except ConnectionError:
   1356                 self.close()


D:\Anaconda3\envs\CPU\lib\http\client.py in begin(self)
    305         # read until we get a non-100 response
    306         while True:
--> 307             version, status, reason = self._read_status()
    308             if status != CONTINUE:
    309                 break


D:\Anaconda3\envs\CPU\lib\http\client.py in _read_status(self)
    266 
    267     def _read_status(self):
--> 268         line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    269         if len(line) > _MAXLINE:
    270             raise LineTooLong("status line")


D:\Anaconda3\envs\CPU\lib\socket.py in readinto(self, b)
    584         while True:
    585             try:
--> 586                 return self._sock.recv_into(b)
    587             except timeout:
    588                 self._timeout_occurred = True


timeout: timed out
1
2
3
4
5
6
7
8
9
import socket
import urllib.request
import urllib.error

# Use a deliberately tiny timeout so the request fails, then detect
# the timeout by inspecting the reason wrapped inside the URLError.
try:
    resp = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as err:
    reason = err.reason
    if isinstance(reason, socket.timeout):
        print('TIME OUT')
1
TIME OUT

响应

响应类型

1
2
3
4
import urllib.request

resp = urllib.request.urlopen('https://www.python.org')
# urlopen returns an http.client.HTTPResponse instance.
print(type(resp))
1
<class 'http.client.HTTPResponse'>

状态码、响应头

1
2
3
4
5
6
import urllib.request

# Inspect the status code and the response headers of the fetched page.
resp = urllib.request.urlopen('https://www.python.org')
print(resp.status)               # HTTP status code
print(resp.getheaders())         # every response header as (name, value) pairs
print(resp.getheader('Server'))  # server software reported by the site
1
2
3
200
[('Connection', 'close'), ('Content-Length', '48997'), ('Server', 'nginx'), ('Content-Type', 'text/html; charset=utf-8'), ('X-Frame-Options', 'DENY'), ('Via', '1.1 vegur'), ('Via', '1.1 varnish'), ('Accept-Ranges', 'bytes'), ('Date', 'Tue, 14 Jul 2020 05:03:52 GMT'), ('Via', '1.1 varnish'), ('Age', '3128'), ('X-Served-By', 'cache-bwi5126-BWI, cache-hkg17920-HKG'), ('X-Cache', 'HIT, HIT'), ('X-Cache-Hits', '42, 1733'), ('X-Timer', 'S1594703032.137189,VS0,VE0'), ('Vary', 'Cookie'), ('Strict-Transport-Security', 'max-age=63072000; includeSubDomains')]
nginx
1
2
3
4
import urllib.request

resp = urllib.request.urlopen('https://www.python.org')
# read() returns the raw body as bytes; decode it to text for printing.
print(resp.read().decode('utf-8'))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
<!doctype html>
<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->
<!--[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->
<!--[if IE 8]>      <html class="no-js ie8 lt-ie9">                 <![endif]-->
<!--[if gt IE 8]><!--><html class="no-js" lang="en" dir="ltr">  <!--<![endif]-->

<head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <link rel="prefetch" href="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js">

    <meta name="application-name" content="Python.org">
    <meta name="msapplication-tooltip" content="The official home of the Python Programming Language">
    <meta name="apple-mobile-web-app-title" content="Python.org">
    <meta name="apple-mobile-web-app-capable" content="yes">
    <meta name="apple-mobile-web-app-status-bar-style" content="black">

    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="HandheldFriendly" content="True">
    <meta name="format-detection" content="telephone=no">
    <meta http-equiv="cleartype" content="on">
    <meta http-equiv="imagetoolbar" content="false">

    <script src="/static/js/libs/modernizr.js"></script>

    <link href="/static/stylesheets/style.30afed881237.css" rel="stylesheet" type="text/css" title="default" />
    <link href="/static/stylesheets/mq.eef77a5d2257.css" rel="stylesheet" type="text/css" media="not print, braille, embossed, speech, tty" />


<!--[if (lte IE 8)&(!IEMobile)]>


<![endif]-->



1
2
3
4
    <title>Welcome to Python.org</title>

    <meta name="description" content="The official home of the Python Programming Language">
    <meta name="keywords" content="Python programming language object oriented web free open source software license documentation download community">


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
    <meta property="og:image" content="https://www.python.org/static/opengraph-icon-200x200.png">
    <meta property="og:image:secure_url" content="https://www.python.org/static/opengraph-icon-200x200.png">

    <meta property="og:url" content="https://www.python.org/">

    <link rel="author" href="/static/humans.txt">

    <link rel="alternate" type="application/rss+xml" title="Python Enhancement Proposals"
          href="https://www.python.org/dev/peps/peps.rss/">
    <link rel="alternate" type="application/rss+xml" title="Python Job Opportunities"
          href="https://www.python.org/jobs/feed/rss/">
    <link rel="alternate" type="application/rss+xml" title="Python Software Foundation News"
          href="https://feeds.feedburner.com/PythonSoftwareFoundationNews">
    <link rel="alternate" type="application/rss+xml" title="Python Insider"
          href="https://feeds.feedburner.com/PythonInsider">





1
2
3
4
    <script src="/static/js/libs/masonry.pkgd.min.js"></script>
    <script src="/static/js/libs/html-includes.js"></script>

    <script type="text/javascript" src="/static/js/main-min.a3326162e3f0.js" charset="utf-8"></script>


<!--[if lte IE 7]>


<![endif]-->

1
2
    <!--[if lte IE 8]>
    <script type="text/javascript" src="/static/js/plugins/getComputedStyle-min.c3860be1d290.js" charset="utf-8"></script>


<![endif]-->






Request

1
2
3
4
5
import urllib.request

# Wrap the URL in a Request object first; this makes it convenient to
# attach extra data (headers, body, method) before opening it.
req = urllib.request.Request('https://python.org')
resp = urllib.request.urlopen(req)
print(resp.read().decode('utf-8'))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
<!doctype html>
<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->
<!--[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->
<!--[if IE 8]>      <html class="no-js ie8 lt-ie9">                 <![endif]-->
<!--[if gt IE 8]><!--><html class="no-js" lang="en" dir="ltr">  <!--<![endif]-->

<head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <link rel="prefetch" href="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js">

    <meta name="application-name" content="Python.org">
    <meta name="msapplication-tooltip" content="The official home of the Python Programming Language">
    <meta name="apple-mobile-web-app-title" content="Python.org">
    <meta name="apple-mobile-web-app-capable" content="yes">
    <meta name="apple-mobile-web-app-status-bar-style" content="black">

    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="HandheldFriendly" content="True">
    <meta name="format-detection" content="telephone=no">
    <meta http-equiv="cleartype" content="on">
    <meta http-equiv="imagetoolbar" content="false">

    <script src="/static/js/libs/modernizr.js"></script>

    <link href="/static/stylesheets/style.30afed881237.css" rel="stylesheet" type="text/css" title="default" />
    <link href="/static/stylesheets/mq.eef77a5d2257.css" rel="stylesheet" type="text/css" media="not print, braille, embossed, speech, tty" />


<!--[if (lte IE 8)&(!IEMobile)]>


<![endif]-->



1
2
3
4
    <title>Welcome to Python.org</title>

    <meta name="description" content="The official home of the Python Programming Language">
    <meta name="keywords" content="Python programming language object oriented web free open source software license documentation download community">


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
    <meta property="og:image" content="https://www.python.org/static/opengraph-icon-200x200.png">
    <meta property="og:image:secure_url" content="https://www.python.org/static/opengraph-icon-200x200.png">

    <meta property="og:url" content="https://www.python.org/">

    <link rel="author" href="/static/humans.txt">

    <link rel="alternate" type="application/rss+xml" title="Python Enhancement Proposals"
          href="https://www.python.org/dev/peps/peps.rss/">
    <link rel="alternate" type="application/rss+xml" title="Python Job Opportunities"
          href="https://www.python.org/jobs/feed/rss/">
    <link rel="alternate" type="application/rss+xml" title="Python Software Foundation News"
          href="https://feeds.feedburner.com/PythonSoftwareFoundationNews">
    <link rel="alternate" type="application/rss+xml" title="Python Insider"
          href="https://feeds.feedburner.com/PythonInsider">





1
2
3
4
    <script src="/static/js/libs/masonry.pkgd.min.js"></script>
    <script src="/static/js/libs/html-includes.js"></script>

    <script type="text/javascript" src="/static/js/main-min.a3326162e3f0.js" charset="utf-8"></script>


<!--[if lte IE 7]>


<![endif]-->

1
2
    <!--[if lte IE 8]>
    <script type="text/javascript" src="/static/js/plugins/getComputedStyle-min.c3860be1d290.js" charset="utf-8"></script>


<![endif]-->






 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
from urllib import request, parse

# Build a fully-specified Request: URL, form body, headers and method.
url = 'http://httpbin.org/post'  # target endpoint
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
# Renamed from `dict` -- the original shadowed the builtin `dict`.
form_fields = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(form_fields), encoding='utf8')  # urlencoded body
req = request.Request(url=url, data=data, headers=headers, method='POST')  # POST request
response = request.urlopen(req)  # send the request
print(response.read().decode('utf-8'))  # print the response body
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "Germey"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "11", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)", 
    "X-Amzn-Trace-Id": "Root=1-5f0d3cbe-7f4e5b70816dee504bfad800"
  }, 
  "json": null, 
  "origin": "183.207.182.162", 
  "url": "http://httpbin.org/post"
}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
from urllib import request, parse

url = 'http://httpbin.org/post'
# Renamed from `dict` -- the original shadowed the builtin `dict`.
form_fields = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(form_fields), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')
# add_header() attaches one header at a time to an existing Request.
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "Germey"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "11", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)", 
    "X-Amzn-Trace-Id": "Root=1-5f0d3cbf-f76d4b20926966007d005540"
  }, 
  "json": null, 
  "origin": "183.207.182.162", 
  "url": "http://httpbin.org/post"
}

Handler

代理

1
2
3
4
5
6
7
8
9
import urllib.request

# Route both HTTP and HTTPS traffic through a local proxy.
proxy = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy)  # opener that goes through the proxy
response = opener.open('http://httpbin.org/get')
print(response.read())
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
---------------------------------------------------------------------------

ConnectionRefusedError                    Traceback (most recent call last)

D:\Anaconda3\envs\CPU\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
   1317                 h.request(req.get_method(), req.selector, req.data, headers,
-> 1318                           encode_chunked=req.has_header('Transfer-encoding'))
   1319             except OSError as err: # timeout error


D:\Anaconda3\envs\CPU\lib\http\client.py in request(self, method, url, body, headers, encode_chunked)
   1261         """Send a complete request to the server."""
-> 1262         self._send_request(method, url, body, headers, encode_chunked)
   1263


D:\Anaconda3\envs\CPU\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked)
   1307             body = _encode(body, 'body')
-> 1308         self.endheaders(body, encode_chunked=encode_chunked)
   1309


D:\Anaconda3\envs\CPU\lib\http\client.py in endheaders(self, message_body, encode_chunked)
   1256             raise CannotSendHeader()
-> 1257         self._send_output(message_body, encode_chunked=encode_chunked)
   1258


D:\Anaconda3\envs\CPU\lib\http\client.py in _send_output(self, message_body, encode_chunked)
   1035         del self._buffer[:]
-> 1036         self.send(msg)
   1037


D:\Anaconda3\envs\CPU\lib\http\client.py in send(self, data)
    973             if self.auto_open:
--> 974                 self.connect()
    975             else:


D:\Anaconda3\envs\CPU\lib\http\client.py in connect(self)
    945         self.sock = self._create_connection(
--> 946             (self.host,self.port), self.timeout, self.source_address)
    947         self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)


D:\Anaconda3\envs\CPU\lib\socket.py in create_connection(address, timeout, source_address)
    723     if err is not None:
--> 724         raise err
    725     else:


D:\Anaconda3\envs\CPU\lib\socket.py in create_connection(address, timeout, source_address)
    712                 sock.bind(source_address)
--> 713             sock.connect(sa)
    714             # Break explicitly a reference cycle


ConnectionRefusedError: [WinError 10061] 由于目标计算机积极拒绝,无法连接。


During handling of the above exception, another exception occurred:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
URLError                                  Traceback (most recent call last)

<ipython-input-17-ccf31deda840> in <module>
      6 })  #构造代理
      7 opener = urllib.request.build_opener(proxy_handler)  #参数传递构建opener
----> 8 response = opener.open('http://httpbin.org/get')
      9 print(response.read())


D:\Anaconda3\envs\CPU\lib\urllib\request.py in open(self, fullurl, data, timeout)
    524             req = meth(req)
    525 
--> 526         response = self._open(req, data)
    527 
    528         # post-process response


D:\Anaconda3\envs\CPU\lib\urllib\request.py in _open(self, req, data)
    542         protocol = req.type
    543         result = self._call_chain(self.handle_open, protocol, protocol +
--> 544                                   '_open', req)
    545         if result:
    546             return result


D:\Anaconda3\envs\CPU\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    502         for handler in handlers:
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:
    506                 return result


D:\Anaconda3\envs\CPU\lib\urllib\request.py in http_open(self, req)
   1344 
   1345     def http_open(self, req):
-> 1346         return self.do_open(http.client.HTTPConnection, req)
   1347 
   1348     http_request = AbstractHTTPHandler.do_request_


D:\Anaconda3\envs\CPU\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
   1318                           encode_chunked=req.has_header('Transfer-encoding'))
   1319             except OSError as err: # timeout error
-> 1320                 raise URLError(err)
   1321             r = h.getresponse()
   1322         except:


URLError: <urlopen error [WinError 10061] 由于目标计算机积极拒绝,无法连接。>
1
2
3
4
5
6
7
8
import http.cookiejar, urllib.request

# Collect the cookies the server sets so the login session can be kept.
jar = http.cookiejar.CookieJar()
cookie_handler = urllib.request.HTTPCookieProcessor(jar)  # handler wrapping the jar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open('http://www.baidu.com')
for ck in jar:  # dump every cookie that was received
    print(ck.name + "=" + ck.value)
1
2
3
4
5
6
BAIDUID=902E910CE0522376501945EC6F6E5152:FG=1
BIDUPSID=902E910CE0522376E5A99057D00C59A0
H_PS_PSSID=32100_1431_31326_32139_31660_32045_32230_32257_31639
PSTM=1594703047
BDSVRTM=0
BD_HOME=1
1
2
3
4
5
6
7
8
# Save the session cookies to disk in Mozilla/Firefox (Netscape) format.
import http.cookiejar, urllib.request

filename = "cookie.txt"  # file the cookies are written to
jar = http.cookiejar.MozillaCookieJar(filename)  # Netscape-format cookie jar
handler = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# Persist everything, including session and expired cookies.
jar.save(ignore_discard=True, ignore_expires=True)
1
2
3
4
5
6
7
8
# Save the session cookies to disk in the alternative LWP format.
import http.cookiejar, urllib.request

filename = 'cookie.txt'
jar = http.cookiejar.LWPCookieJar(filename)  # libwww-perl format cookie jar
handler = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
jar.save(ignore_discard=True, ignore_expires=True)  # keep session/expired cookies too
1
2
3
4
5
6
7
8
# Load cookies back with the same jar class that was used to save them.
import http.cookiejar, urllib.request

jar = http.cookiejar.LWPCookieJar()  # empty jar in LWP format
jar.load('cookie.txt', ignore_discard=True, ignore_expires=True)  # read cookies from disk
handler = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
1
<!DOCTYPE html><!--STATUS OK-->


百度一下,你就知道



1
    </html>

异常处理

1
2
3
4
5
from urllib import request, error

# Without the except clause an unreachable URL would crash the program.
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as err:
    print(err.reason)  # show why the request failed
1
Not Found
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
from urllib import request, error

# Catch the more specific HTTPError (a subclass) first, then fall back
# to the parent URLError; only report success when nothing was raised.
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as http_err:
    print(http_err.reason, http_err.code, http_err.headers, sep='\n')
except error.URLError as url_err:
    print(url_err.reason)
else:
    print('Request Successfully')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
Not Found
404
Server: nginx/1.10.3 (Ubuntu)
Date: Tue, 14 Jul 2020 05:04:17 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Connection: close
Set-Cookie: PHPSESSID=71mv8t8g9933cncca9uaa5fiv7; path=/
Pragma: no-cache
Vary: Cookie
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Cache-Control: no-cache, must-revalidate, max-age=0
Link: <https://cuiqingcai.com/wp-json/>; rel="https://api.w.org/"


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
import socket
import urllib.request
import urllib.error

# The URLError wraps the underlying failure in `.reason`; for a timeout
# that reason is a socket.timeout instance.
try:
    response = urllib.request.urlopen('https://www.baidu.com', timeout=0.01)
except urllib.error.URLError as err:
    reason = err.reason
    print(type(reason))  # show the concrete reason type
    if isinstance(reason, socket.timeout):
        print('TIME OUT')
1
2
<class 'socket.timeout'>
TIME OUT

URL解析

urlparse

urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

1
2
3
4
5
# Break a URL down into its six standard components
# (scheme, netloc, path, params, query, fragment).
from urllib.parse import urlparse

parsed = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(parsed), parsed)
1
<class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
1
2
3
4
from urllib.parse import urlparse

# The `scheme` argument is only a default: it is applied when the URL
# itself does not carry one.
parsed = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(parsed)
1
ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')
1
2
3
4
from urllib.parse import urlparse

# The URL already declares http://, so the scheme='https' default is ignored.
parsed = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(parsed)
1
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
1
2
3
4
from urllib.parse import urlparse

# With allow_fragments=False the fragment is not split off: it stays
# attached to the query part.
parsed = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(parsed)
1
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')
1
2
3
4
from urllib.parse import urlparse

# No query is present, so the un-split fragment folds back into the path.
parsed = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(parsed)
1
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')

urlunparse

1
2
3
4
from urllib.parse import urlunparse

# Reassemble a URL from its six components (the inverse of urlparse).
parts = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(parts))
1
http://www.baidu.com/index.html;user?a=6#comment

urljoin

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
from urllib.parse import urljoin

# urljoin resolves the second URL against the first: components present
# in the second URL win, missing ones are filled in from the base.
cases = [
    ('http://www.baidu.com', 'FAQ.html'),
    ('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'),
    ('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'),
    ('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'),
    ('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'),
    ('http://www.baidu.com', '?category=2#comment'),
    ('www.baidu.com', '?category=2#comment'),
    ('www.baidu.com#comment', '?category=2'),
]
for base, ref in cases:
    print(urljoin(base, ref))
1
2
3
4
5
6
7
8
http://www.baidu.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html?question=2
https://cuiqingcai.com/index.php
http://www.baidu.com?category=2#comment
www.baidu.com?category=2#comment
www.baidu.com?category=2

urlencode

1
2
3
4
5
6
7
8
9
from urllib.parse import urlencode

# Serialize a dict of query parameters and append it to the base URL.
query = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(query)  # dict -> "name=germey&age=22"
print(url)
1
http://www.baidu.com?name=germey&age=22
1